The NYC Rat Sightings dataset describes mostly the location where there are rat sightings, the dates when it happened and which type of building it happened on. In the original dataset there are also many empty columns or columns with only one value, which just means the data was organised poorly, and we will not use these.
We will analyse if sightings depend on the month they happen. Additionally, we will analyse a dataset that describes borough populations so we can analyse if population density compares to number of rat sightings and which boroughs have more density of rat sightings.
The dataset we will be using can be found on kaggle.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
First, we create a function to use for splitting up the different dates.
date.split <- function(txtdate, x){
date <- parse_date(txtdate, format = "%m/%d/%Y %I:%M:%S %p")
time <- parse_time(txtdate, format = "%m/%d/%Y %I:%M:%S %p")
year <- as.factor(format(date, format = "%Y"))
month <- as.factor(format(date, format = "%m"))
day <- as.numeric(format(date, format = "%d"))
if (x == 1) {value <- year}
else if (x == 2) {value <- month}
else if (x == 3) {value <- day}
else if (x == 4) {value <- time}
return(value)
}
Secondly, we create a function to use when splitting a specific number from a data point, as used for both the address and the community boards.
#Second attempt at a function that could split "Incident_Address" and "Community Board":
get.number <- function(address){
if(!identical(address, NA)){
nums <- str_extract(address, "[0-9]+")
return(nums)
}
}
The dataset initially looks like this:
rat_dataset <-
read_csv("Rat_Sightings.csv", show_col_types = FALSE)
## Warning: One or more parsing issues, see `problems()` for details
head(rat_dataset)
## # A tibble: 6 × 52
## Uniqu…¹ Creat…² Close…³ Agency Agenc…⁴ Compl…⁵ Descr…⁶ Locat…⁷ Incid…⁸ Incid…⁹
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 3.15e7 09/04/… 09/18/… DOHMH Depart… Rodent Rat Si… 3+ Fam… 10006 <NA>
## 2 3.15e7 09/04/… 10/28/… DOHMH Depart… Rodent Rat Si… Commer… 10306 2270 H…
## 3 3.15e7 09/04/… <NA> DOHMH Depart… Rodent Rat Si… 1-2 Fa… 10310 758 PO…
## 4 3.15e7 09/04/… 09/14/… DOHMH Depart… Rodent Rat Si… 3+ Fam… 11206 198 SC…
## 5 3.15e7 09/04/… 09/22/… DOHMH Depart… Rodent Rat Si… 3+ Fam… 10462 2138 W…
## 6 3.15e7 09/04/… 09/22/… DOHMH Depart… Rodent Rat Si… 3+ Fam… 11231 179 LU…
## # … with 42 more variables: `Street Name` <chr>, `Cross Street 1` <chr>,
## # `Cross Street 2` <chr>, `Intersection Street 1` <chr>,
## # `Intersection Street 2` <chr>, `Address Type` <chr>, City <chr>,
## # Landmark <lgl>, `Facility Type` <chr>, Status <chr>, `Due Date` <chr>,
## # `Resolution Action Updated Date` <chr>, `Community Board` <chr>,
## # Borough <chr>, `X Coordinate (State Plane)` <dbl>,
## # `Y Coordinate (State Plane)` <dbl>, `Park Facility Name` <chr>, …
with many empty features or with one single value. In this next step, we will be taking these features out and separating some interesting data in more features:
rat_dataset <-
read_csv("Rat_Sightings.csv", show_col_types = FALSE) %>%
#select("Unique Key", "Created Date", "Closed Date", "Location Type", "Incident Zip", "Incident Address", "Street Name", "Cross Street 1", "Cross Street 2", "Intersection Street 1", "Intersection Street 2", "Address Type", "Status", "Due Date", "Resolution Action Updated Date", "Community Board", "Borough", "Latitude", "Longitude") %>%
rename(Unique_Key = "Unique Key", Created_Date = "Created Date", Closed_Date = "Closed Date", Location_Type = "Location Type", Incident_Zip = "Incident Zip", Incident_Address = "Incident Address", Street_Name = "Street Name", Cross_Street_1 = "Cross Street 1", Cross_Street_2 = "Cross Street 2", Intersection_Street_1 = "Intersection Street 1", Intersection_Street_2 = "Intersection Street 2", Address_Type = "Address Type", Status = "Status", Due_Date = "Due Date", RAU_Date = "Resolution Action Updated Date", Community_Board = "Community Board", Borough = "Borough", Latitude = "Latitude", Longitude = "Longitude") %>%
mutate(Created_Year = date.split(Created_Date, 1), Created_Month = date.split(Created_Date, 2), Created_Day = date.split(Created_Date, 3)) %>%
mutate(Closed_Year = date.split(Closed_Date, 1), Closed_Month = date.split(Closed_Date, 2), Closed_Day = date.split(Closed_Date, 3)) %>%
mutate(Due_Year = date.split(Due_Date, 1), Due_Month = date.split(Due_Date, 2), Due_Day = date.split(Due_Date, 3), Due_Time = date.split(Due_Date, 4)) %>%
mutate(RAU_Year = date.split(RAU_Date, 1), RAU_Month = date.split(RAU_Date, 2), RAU_Day = date.split(RAU_Date, 3), RAU_Time = date.split(RAU_Date, 4)) %>%
mutate(Address_Number = get.number(Incident_Address)) %>%
mutate(Community_Board_Number = get.number(Community_Board)) %>%
select(Unique_Key, Created_Year, Created_Month, Created_Day, Closed_Year, Closed_Month, Closed_Day, Location_Type, Incident_Zip, Address_Number, Street_Name, Cross_Street_1, Cross_Street_2, Intersection_Street_1, Intersection_Street_2, Address_Type, Status, Due_Year, Due_Month, Due_Day, Due_Time, RAU_Year, RAU_Month, RAU_Day, RAU_Time, Community_Board_Number, Borough, Latitude, Longitude)
## Warning: One or more parsing issues, see `problems()` for details
We also want to make sure that all of the features have the proper value type.
rat_dataset$Location_Type <-
as.factor(rat_dataset$Location_Type)
rat_dataset$Street_Name <-
as.factor(rat_dataset$Street_Name)
rat_dataset$Cross_Street_1 <-
as.factor(rat_dataset$Cross_Street_1)
rat_dataset$Cross_Street_2 <-
as.factor(rat_dataset$Cross_Street_2)
rat_dataset$Address_Type <-
as.factor(rat_dataset$Address_Type)
rat_dataset$Status <-
as.factor(rat_dataset$Status)
rat_dataset$Borough <-
as.factor(rat_dataset$Borough)
So, our starting point data is:
head(rat_dataset)
## # A tibble: 6 × 29
## Unique_Key Created_Y…¹ Creat…² Creat…³ Close…⁴ Close…⁵ Close…⁶ Locat…⁷ Incid…⁸
## <dbl> <fct> <fct> <dbl> <fct> <fct> <dbl> <fct> <dbl>
## 1 31464015 2015 09 4 2015 09 18 3+ Fam… 10006
## 2 31464024 2015 09 4 2015 10 28 Commer… 10306
## 3 31464025 2015 09 4 <NA> <NA> NA 1-2 Fa… 10310
## 4 31464026 2015 09 4 2015 09 14 3+ Fam… 11206
## 5 31464027 2015 09 4 2015 09 22 3+ Fam… 10462
## 6 31464188 2015 09 4 2015 09 22 3+ Fam… 11231
## # … with 20 more variables: Address_Number <chr>, Street_Name <fct>,
## # Cross_Street_1 <fct>, Cross_Street_2 <fct>, Intersection_Street_1 <chr>,
## # Intersection_Street_2 <chr>, Address_Type <fct>, Status <fct>,
## # Due_Year <fct>, Due_Month <fct>, Due_Day <dbl>, Due_Time <time>,
## # RAU_Year <fct>, RAU_Month <fct>, RAU_Day <dbl>, RAU_Time <time>,
## # Community_Board_Number <chr>, Borough <fct>, Latitude <dbl>,
## # Longitude <dbl>, and abbreviated variable names ¹Created_Year, …
summary(rat_dataset)
## Unique_Key Created_Year Created_Month Created_Day
## Min. :11464394 2016 :17230 07 :11982 Min. : 1.00
## 1st Qu.:23414521 2015 :15272 08 :11656 1st Qu.: 8.00
## Median :28836804 2017 :14425 06 :11307 Median :16.00
## Mean :28158637 2014 :12617 05 :10779 Mean :15.69
## 3rd Qu.:33460138 2013 :10739 09 : 9669 3rd Qu.:23.00
## Max. :37197000 2012 :10643 04 : 8897 Max. :31.00
## (Other):20988 (Other):37624
## Closed_Year Closed_Month Closed_Day
## 2016 :15339 08 :10479 Min. : 1.00
## 2015 :13672 07 :10079 1st Qu.: 8.00
## 2017 :12980 06 : 9595 Median :16.00
## 2014 :12173 09 : 9278 Mean :15.86
## 2011 :10685 05 : 8305 3rd Qu.:23.00
## (Other):26134 (Other):43247 Max. :31.00
## NA's :10931 NA's :10931 NA's :10931
## Location_Type Incident_Zip Address_Number
## 3+ Family Apt. Building :41061 Min. : 83 Length:101914
## 1-2 Family Dwelling :19702 1st Qu.: 10086 Class :character
## Other (Explain Below) :15044 Median : 10472 Mode :character
## 3+ Family Mixed Use Building: 7991 Mean : 10729
## Commercial Building : 5007 3rd Qu.: 11222
## (Other) :13103 Max. :100354
## NA's : 6 NA's :336
## Street_Name Cross_Street_1 Cross_Street_2
## GRAND CONCOURSE : 713 AMSTERDAM AVENUE: 1617 BROADWAY: 2053
## BROADWAY : 705 BROADWAY : 1494 BEND : 1456
## EASTERN PARKWAY : 632 BEND : 1312 1 AVENUE: 1082
## LAFAYETTE AVENUE: 379 2 AVENUE : 1128 7 AVENUE: 943
## GREENE AVENUE : 366 3 AVENUE : 1113 8 AVENUE: 914
## (Other) :90044 (Other) :78594 (Other) :78777
## NA's : 9075 NA's :16656 NA's :16689
## Intersection_Street_1 Intersection_Street_2 Address_Type
## Length:101914 Length:101914 ADDRESS :78090
## Class :character Class :character BLOCKFACE : 4385
## Mode :character Mode :character INTERSECTION: 8777
## LATLONG :10281
## PLACENAME : 35
## NA's : 346
##
## Status Due_Year Due_Month Due_Day
## Assigned: 4642 2016 :17243 08 :11929 Min. : 1.00
## Closed :74025 2017 :15343 07 :11691 1st Qu.: 8.00
## Draft : 1 2015 :15012 09 :11263 Median :16.00
## Open : 1 2014 :12391 06 :10472 Mean :15.67
## Pending :23245 2013 :10858 10 : 9932 3rd Qu.:23.00
## (Other):30950 (Other):46510 Max. :31.00
## NA's : 117 NA's : 117 NA's :117
## Due_Time RAU_Year RAU_Month RAU_Day
## Length:101914 2016 :17375 07 :12227 Min. : 1.00
## Class1:hms 2015 :15361 08 :11537 1st Qu.: 8.00
## Class2:difftime 2017 :14505 06 :11340 Median :16.00
## Mode :numeric 2014 :12557 09 :10657 Mean :15.82
## 2011 :11160 05 : 9425 3rd Qu.:23.00
## (Other):30953 (Other):46725 Max. :31.00
## NA's : 3 NA's : 3 NA's :3
## RAU_Time Community_Board_Number Borough Latitude
## Length:101914 Length:101914 BRONX :20706 Min. :40.50
## Class1:hms Class :character BROOKLYN :34673 1st Qu.:40.68
## Class2:difftime Mode :character MANHATTAN :26803 Median :40.73
## Mode :numeric QUEENS :14811 Mean :40.74
## STATEN ISLAND: 4920 3rd Qu.:40.82
## Unspecified : 1 Max. :40.91
## NA's :706
## Longitude
## Min. :-74.25
## 1st Qu.:-73.97
## Median :-73.94
## Mean :-73.93
## 3rd Qu.:-73.90
## Max. :-73.70
## NA's :706
To answer this question, we will import an extension of the
ggplot2 library, the ggmap library. If you
don’t have the ggmap package installed, please do so by
running install.packages("ggmap") on your R console.
library("ggmap")
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
KEY = "AIzaSyDZb4hXrsdgQ7zjHoT1h6A26G7YbCV2jBc"
register_google(key = KEY)
Firstly, we take a look at how the sightings are distributed:
ggplot(rat_dataset, mapping = aes(y = Longitude)) +
geom_boxplot() +
theme_minimal()
## Warning: Removed 706 rows containing non-finite values (stat_boxplot).
ggplot(rat_dataset, mapping = aes(y = Latitude)) +
geom_boxplot() +
theme_minimal()
## Warning: Removed 706 rows containing non-finite values (stat_boxplot).
There is a concentration in the middle of the range, with many of
what geom_boxplot() considers outliers. So the part of New
York we will be looking at is:
data4lat <-
rat_dataset %>%
select(Latitude) %>%
filter(!is.na(Latitude))
max_lat <- max(data4lat)
min_lat <- min(data4lat)
data4lon <-
rat_dataset %>%
select(Longitude) %>%
filter(!is.na(Longitude))
max_lon <- max(data4lon)
min_lon <- min(data4lon)
newyork.map <- get_map(location= 'New York')
## Source : https://maps.googleapis.com/maps/api/staticmap?center=New%20York&zoom=10&size=640x640&scale=2&maptype=terrain&language=en-EN&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=New+York&key=xxx
sightings_map <-
ggmap(newyork.map) +
ylim(min_lat, max_lat) +
xlim(min_lon, max_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
sightings_map
## Warning: Removed 1 rows containing missing values (geom_rect).
From the distribution on the boxplots, we can assume that most of the sightings are concentrated in the middle of the map, which we can confirm by plotting the sightings. There is also a smaller concentration at the top middle of the map.
sightings_map <-
ggmap(newyork.map) +
geom_point(data = rat_dataset, mapping = aes(x = Longitude, y = Latitude), alpha = 0.1) +
stat_density2d(data = rat_dataset, mapping = aes(x = Longitude, y = Latitude)) +
ylim(min_lat, max_lat) +
xlim(min_lon, max_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
sightings_map
## Warning: Removed 706 rows containing non-finite values (stat_density2d).
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 706 rows containing missing values (geom_point).
It seems like the sightings are mostly concentrated in 3 boroughs: Manhatten, Bronx and Brooklyn, with Staten Island having the least concentration by far.
borough_population <-
rat_dataset %>%
ggplot(aes(x = Borough, fill = Borough)) +
geom_bar() +
geom_text(stat='count', aes(label=..count..), vjust=-1) +
scale_fill_manual(values=c("#882E72", "#B178A6", "#114477", "#1965B0",
"#5289C7", "#7BAFDE", "#4EB265", "#90C987",
"#CAE0AB", "#F7EE55", "#F6C141", "#F1932D",
"#E8601C", "#DC050C", "#D6C1DE", "#4477AA",
"#77AADD", "#117755", "#44AA88", "#99CCBB",
"#777711")) +
ylim(0, 40000) +
labs(x = "Borough", y = "Total Number of Rat Sightings",
title = "Number of Rat Sightings in Each Borough") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
borough_population
It looks like Brooklyn is the borough with the most rat sightings. Even though the map shows that Manhatten, for example, looks more densely populated, Manhatten is also a bigger borough, so the sightings are more evenly distributed throughout it.
data4lat_brooklyn <-
rat_dataset %>%
filter(!is.na(Latitude)) %>%
filter(Borough == "BROOKLYN") %>%
select(Latitude)
max_brooklyn_lat <- max(data4lat_brooklyn)
min_brooklyn_lat <- min(data4lat_brooklyn)
data4lon_brooklyn <-
rat_dataset %>%
filter(!is.na(Longitude)) %>%
filter(Borough == "BROOKLYN") %>%
select(Longitude)
max_brooklyn_lon <- max(data4lon_brooklyn)
min_brooklyn_lon <- min(data4lon_brooklyn)
brooklyn_rats <-
rat_dataset %>%
filter(Borough == "BROOKLYN")
brooklyn_map <-
ggmap(newyork.map) +
geom_point(data = brooklyn_rats, mapping = aes(x = Longitude, y = Latitude), color = "Red", size = 0.1) +
ylim(min_brooklyn_lat, max_brooklyn_lat) +
xlim(min_brooklyn_lon, max_brooklyn_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
brooklyn_map
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 206 rows containing missing values (geom_point).
The rats seem to stand away from the beach and the parks (they probably have work to do, like us). The denser the subway lines get, the denser the rat population too, it seems.
location_map <-
ggmap(newyork.map) +
geom_point(data = rat_dataset, mapping = aes(x = Longitude, y = Latitude, color = Location_Type), alpha = 0.3) +
facet_wrap(~ Location_Type) +
ylim(min_lat, max_lat) +
xlim(min_lon, max_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
location_map
## Warning: Removed 21 rows containing missing values (geom_rect).
## Warning: Removed 706 rows containing missing values (geom_point).
That does not seem like a great visualization (as you really can’t see anything), let’s divide our facets into multiple plots so we can take a closer look at each of them.
location_subset1 <- subset(rat_dataset, Location_Type %in% c("1-2 Family Dwelling", "1-2 Family Mixed Use Building", "3+ Family Apt. Building", "3+ Family Mixed Use Building"))
location_map1 <-
ggmap(newyork.map) +
geom_point(data = location_subset1, mapping = aes(x = Longitude, y = Latitude, color = Location_Type), alpha = 0.3) +
facet_wrap(~ Location_Type, nrow = 2) +
ylim(min_lat, max_lat) +
xlim(min_lon, max_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
location_map1
## Warning: Removed 4 rows containing missing values (geom_rect).
## Warning: Removed 195 rows containing missing values (geom_point).
location_subset2 <- subset(rat_dataset, Location_Type %in% c("Catch Basin/Sewer", "Commercial Building", "Construction Site", "Day Care/Nursery"))
location_map2 <-
ggmap(newyork.map) +
geom_point(data = location_subset2, mapping = aes(x = Longitude, y = Latitude, color = Location_Type), alpha = 0.3) +
facet_wrap(~ Location_Type, nrow = 2) +
ylim(min_lat, max_lat) +
xlim(min_lon, max_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
location_map2
## Warning: Removed 4 rows containing missing values (geom_rect).
## Warning: Removed 90 rows containing missing values (geom_point).
location_subset3 <- subset(rat_dataset, Location_Type %in% c("Government Building", "Hospital", "Office Building", "Other (Explain Below)"))
location_map3 <-
ggmap(newyork.map) +
geom_point(data = location_subset3, mapping = aes(x = Longitude, y = Latitude, color = Location_Type), alpha = 0.3) +
facet_wrap(~ Location_Type, nrow = 2) +
ylim(min_lat, max_lat) +
xlim(min_lon, max_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
location_map3
## Warning: Removed 4 rows containing missing values (geom_rect).
## Warning: Removed 348 rows containing missing values (geom_point).
location_subset4 <- subset(rat_dataset, Location_Type %in% c("Parking Lot/Garage", "Public Garden", "Public Stairs", "School/Pre-School"))
location_map4 <-
ggmap(newyork.map) +
geom_point(data = location_subset4, mapping = aes(x = Longitude, y = Latitude, color = Location_Type), alpha = 0.3) +
facet_wrap(~ Location_Type, nrow = 2) +
ylim(min_lat, max_lat) +
xlim(min_lon, max_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
location_map4
## Warning: Removed 4 rows containing missing values (geom_rect).
## Warning: Removed 26 rows containing missing values (geom_point).
location_subset5 <- subset(rat_dataset, Location_Type %in% c("Single Room Occupancy (SRO)", "Summer Camp", "Vacant Building", "Vacant Lot"))
location_map5 <-
ggmap(newyork.map) +
geom_point(data = location_subset5, mapping = aes(x = Longitude, y = Latitude, color = Location_Type), alpha = 0.3) +
facet_wrap(~ Location_Type, nrow = 2) +
ylim(min_lat, max_lat) +
xlim(min_lon, max_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
location_map5
## Warning: Removed 4 rows containing missing values (geom_rect).
## Warning: Removed 47 rows containing missing values (geom_point).
Looking at the Family Buildings, it is actually scary how evenly spread out (and still, always concentrated) around the maps. Even in Staten Island, which we already saw had the least concentration, we can still see that most of them were in Family Buildings. Looking at the map for the sightings at Sewers, it is no surprise the bigest concentration is in Manhatten, which is famously known in movies for having rats come out of sewers, as well as walking around stealing Dolar Slices, so you can see that Manhatten is also the one with more concentration in Public Spaces. It is also no surprise most of the sightings at Government Buildings are in Manhatten, as most of the Government buildings in New York are in Downtown Manhatten. There are no rat sightings in Office Buildings in Staten Island, as it is a Borough known to be mostly residential. For the “Other (Explain Below)”, it makes sense for this one to be the most densely populated (as there are many building types not listed as the primary ones) and the one with the sightings more evenly distributed (as there are “other” types of buildings everywhere).
years_map <-
ggmap(newyork.map) +
geom_point(data = rat_dataset, mapping = aes(x = Longitude, y = Latitude, color = Created_Year), alpha = 0.3) +
facet_wrap(~ Created_Year) +
ylim(min_lat, max_lat) +
xlim(min_lon, max_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
years_map
## Warning: Removed 8 rows containing missing values (geom_rect).
## Warning: Removed 706 rows containing missing values (geom_point).
It definitely looks like that is not the case, as the maps all look basically exactly the same, meaning the actual locations of the sightings are constant, which may indicate to no interventions ever having been done in any of the places.
Check how many location types it is.
unique(rat_dataset$Location_Type)
## [1] 3+ Family Mixed Use Building Commercial Building
## [3] 1-2 Family Dwelling 3+ Family Apt. Building
## [5] Public Stairs Other (Explain Below)
## [7] Vacant Lot Construction Site
## [9] Hospital Parking Lot/Garage
## [11] Catch Basin/Sewer Vacant Building
## [13] 1-2 Family Mixed Use Building Public Garden
## [15] Government Building Office Building
## [17] School/Pre-School Day Care/Nursery
## [19] Single Room Occupancy (SRO) Summer Camp
## [21] <NA>
## 20 Levels: 1-2 Family Dwelling ... Vacant Lot
# 20 location types
There are 20 location types.
Compute the percentage of rat sighting in each location type
loc_per <- rat_dataset %>%
group_by(Location_Type) %>%
summarise(Percentage = round(100*(n() / nrow(.)), 1))
loc_per
## # A tibble: 21 × 2
## Location_Type Percentage
## <fct> <dbl>
## 1 1-2 Family Dwelling 19.3
## 2 1-2 Family Mixed Use Building 1.7
## 3 3+ Family Apt. Building 40.3
## 4 3+ Family Mixed Use Building 7.8
## 5 Catch Basin/Sewer 1.1
## 6 Commercial Building 4.9
## 7 Construction Site 2.2
## 8 Day Care/Nursery 0.1
## 9 Government Building 0.3
## 10 Hospital 0.1
## # … with 11 more rows
Plot the bar chart that reveals the occupation of rat sighting in each location type
library(ggcharts)
loc_per %>%
bar_chart(Location_Type, Percentage,
bar_color = c("#882E72", "#B178A6", "#114477", "#1965B0",
"#5289C7", "#7BAFDE", "#4EB265", "#90C987",
"#CAE0AB", "#F7EE55", "#F6C141", "#F1932D",
"#E8601C", "#DC050C", "#D6C1DE", "#4477AA",
"#77AADD", "#117755", "#44AA88", "#99CCBB",
"#777711")) +
geom_text(aes(label = paste(Percentage , "%")),
hjust = -0.2) +
ylim(NA, 45) +
theme(plot.title = element_text(hjust=0.5)) +
labs(x = "", y = "", title = "Percentage of Rat Sighting in Different Location Type",
fill = "Location Type")
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
Following the implications of the bar chart, the most frequent cases of rat sightings occurred in apartment buildings with 3 or more family members, accounting for 40.3%. This is followed by residences with 1 to 2 family members at 19.3%. Notably, mixed-use buildings with 3 or more family members also accounted for 7.8%. Overall, the types of locations associated with families held the most complaints of rat sightings.
Plot the number of rat sightings each month
rat_dataset %>%
ggplot(aes(x = Created_Month, fill = Created_Month)) +
geom_bar() +
geom_text(stat='count', aes(label=..count..), vjust=-1) +
scale_fill_manual(values=c("#882E72", "#B178A6", "#114477", "#1965B0",
"#5289C7", "#7BAFDE", "#4EB265", "#90C987",
"#CAE0AB", "#F7EE55", "#F6C141", "#F1932D",
"#E8601C", "#DC050C", "#D6C1DE", "#4477AA",
"#77AADD", "#117755", "#44AA88", "#99CCBB",
"#777711")) +
ylim(0, 15000) +
labs(x = "Month", y = "Total Number of Rat Sighting",
title = "Number of Rat Sighting Each Month", fill = "Month") +
theme_minimal() +
theme(plot.title = element_text(hjust=0.5))
In July, the highest number of rat sightings was recorded, with 11,982 cases, followed by August and June. In general, the number of rat sightings can be identified in relation to fluctuations in temperature, as in winter months, i.e. November, December, January and February, the number of rat sightings are the lowest. With rising temperatures, however, the number of rat sightings increased markedly.
Plot the Percentage of rat sightings by month
# compute the percentage of each month on rat sighting
month_per <- rat_dataset %>%
group_by(Created_Month) %>%
summarise(Percentage = round(100*(n() / nrow(.)),1))
month_per
## # A tibble: 12 × 2
## Created_Month Percentage
## <fct> <dbl>
## 1 01 5.6
## 2 02 5.8
## 3 03 7.6
## 4 04 8.7
## 5 05 10.6
## 6 06 11.1
## 7 07 11.8
## 8 08 11.4
## 9 09 9.5
## 10 10 7.8
## 11 11 5.3
## 12 12 4.8
month_per %>%
mutate(Month = Created_Month,
Month = recode_factor(Month, '01' = "January", '02' = "Febuary",
'03' = "March", '04' = "April", '05' = "May", '06' = "June",
'07' = "July", '08' = "August", '09' = "September",
'10' = "October", '11' = "November", '12' = "December")) %>%
bar_chart(Month, Percentage,
bar_color = c("#882E72", "#B178A6", "#114477", "#1965B0",
"#5289C7", "#7BAFDE", "#4EB265", "#90C987",
"#CAE0AB", "#F7EE55", "#F6C141", "#F1932D")) +
geom_text(aes(label = paste(Percentage , "%")),
hjust = -0.2) +
ylim(NA, 15) +
theme(plot.title = element_text(hjust=0.5)) +
labs(x = "", y = "Percentage", title = "Percentage of Rat Sighting in Each Month",
fill = "Month")
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
A monthly percentage of rat sightings clearly reveals that rat sightings can be roughly divided by season. Rat sightings are highest in the summer months, followed by spring and fall, and lowest in the winter months.
Plot the number of rat sightings versus year
rat_dataset %>%
ggplot(aes(x = Created_Year, fill = Created_Year)) +
geom_bar() +
geom_text(stat='count', aes(label=..count..), vjust=-1) +
scale_fill_manual(values=c("#882E72", "#B178A6", "#5289C7", "#7BAFDE",
"#CAE0AB", "#F7EE55", "#E8601C", "#DC050C")) +
ylim(0, 20000) +
labs(x = "Year", y = "Total Number of Rat Sighting",
title = "Number of Rat Sighting 2010 - 2017", fill = "Year") +
theme_classic() +
theme(plot.title = element_text(hjust=0.5))
Since 2010 to 2013, the number of rat sighting cases did not fluctuate measurably. Yet, the number of rat sightings escalated yearly after 2013, from 2014 to 2016, with the number of rat sightings growing from 10,534 cases in 2010 to 17,230 cases in 2016. Possible reasons behind this are that New York residents tend to report rat sightings to the administration year by year or that the population of New York rises over the period of the years, thus causing the rat population to rise with it.
Another thing that needs to be mentioned here is that data collection was interrupted in September 2017, hence the drop in rat sightings in 2017.
Plot the line chart of rat sightings versus year to more easily observe changes
# Year with rat sighting (count)
year_count <- rat_dataset %>%
group_by(Created_Year) %>%
summarise(Number = n())
year_count
## # A tibble: 8 × 2
## Created_Year Number
## <fct> <int>
## 1 2010 10534
## 2 2011 10454
## 3 2012 10643
## 4 2013 10739
## 5 2014 12617
## 6 2015 15272
## 7 2016 17230
## 8 2017 14425
# Year with rat sighting (count)
year_count %>%
ggplot(aes(x = Created_Year, y = Number, group = 1)) +
geom_point(size = 1.5) +
geom_line(color = "dark blue", size = 1) +
labs(x = "Year", y = "Number of Rat Sighting",
title = "Number of Rat Sighting 2010 - 2017", fill = "Year") +
theme_classic() +
theme(plot.title = element_text(hjust=0.5))
The line graph offers a better experience in observing the trend of rat sightings by year. The line graph reaffirms the above discussion, with an alarming increase in rat sightings from 2013 to 2016.
Mapping the number of rat sightings for each month of each year
# Number of Rat Sighting in each Months of Each Year
year_month_count <- rat_dataset %>%
group_by(Created_Year) %>%
count(Created_Month)
# Number of Rat Sighting in each Months of Each Year
year_month_count %>%
ggplot(aes(x = Created_Month, y = n, color = Created_Year, group = Created_Year)) +
geom_point() +
geom_line() +
scale_color_manual(values=c("#882E72", "black", "#EEA236", "#5CB85C",
"#46B8DA", "#9632B8", "dark green", "#DC050C")) +
labs(x = "Month", y = "Number of Rat Sighting",
title = "Number of Rat Sighting", color = "Year") +
theme_classic() +
theme(plot.title = element_text(hjust=0.5))
The graph reflects that the profile of the number of rat sightings is similar from year to year, with the most frequent rat sightings usually happening in the summer months as opposed to the less frequent rat sightings usually falling in the winter months. On the other hand, the number of rat sightings rose over time, in the case of 2017, even though the data are not yet complete, it remained noticeable that the overall number of rat sightings per month was much higher than in previous years.
Compute the length of time to solve a complaint of a rat sighting
# combine the year, month, and day of started day
starttime <- as.Date(with(rat_dataset, paste(Created_Year, Created_Month,
Created_Day, sep = "-")), "%Y-%m-%d")
# combine the year, month, and day of closed day
endtime <- as.Date(with(rat_dataset, paste(Closed_Year, Closed_Month,
Closed_Day, sep = "-")), "%Y-%m-%d")
# compute for the duration taken to solve
solved_duration <- data.frame(difftime(endtime,starttime,units = "days"))
colnames(solved_duration)[1] <- "duration"
solved_duration$duration <- as.numeric(solved_duration$duration)
# merge year created year with solved time, in order to see if the government's efficient improve
solved_duration <- cbind(rat_dataset$Created_Year, solved_duration)
colnames(solved_duration)[1] <- "Sighting_Year"
# remove the negative number, as it cannot be negative time to solve the issue
solved_duration <- solved_duration %>%
mutate(month_cat = cut(duration, breaks = c(0, 30, 60, 90, 120, 150, 180, 1000),
labels = c("less than 1 month", "less than 2 month",
"less than 3 month", "less than 4 month",
"less than 5 month", "less than half year",
"more than half year")))
# merge it to the original dataset
rat_dataset <- cbind(rat_dataset, solved_duration)
solved_duration <-
solved_duration %>%
drop_na()
Plot the time required to solve the problem
solved_duration %>%
ggplot(aes(x = month_cat, fill = month_cat)) +
geom_bar() +
geom_text(stat='count', aes(label=..count..), vjust=-1) +
scale_fill_manual(values=c("#882E72", "#5289C7", "#CAE0AB",
"#E8601C", "#DC050C", "#D6C1DE", "#4477AA")) +
ylim(0, 70000) +
labs(x = "Time to Solve", y = "Cases Count",
title = "Time to Solve Rat Sighting", fill = "Month") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90,
hjust = 1, vjust = 0),
plot.title = element_text(hjust=0.5))
Apparently, the majority of rat sighting complaints were resolved within 1 month, with 64,371 cases, followed by less than 2 months, with 5,599 cases, suggesting that the administration is efficient in tackling rat sighting problems.
duration_map <-
ggmap(newyork.map) +
geom_point(data = rat_dataset %>% filter_at(vars(Longitude, Latitude, month_cat),all_vars(!is.na(.))), mapping = aes(x = Longitude, y = Latitude, color = month_cat), alpha = 0.3) +
facet_wrap(~ month_cat) +
ylim(min_lat, max_lat) +
xlim(min_lon, max_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
duration_map
## Warning: Removed 7 rows containing missing values (geom_rect).
The cases that take longer than 4 months to be closed are mostly localised in Manhatten, which makes sense, because, as you will see later, Mahantten is the borough with the biggest rat/population ratio.
Plot the number of rat sighting case resolution times by year.
# plot the duration taken to solved the issue of each year
solved_duration %>%
filter(Sighting_Year == c(2010, 2011, 2012, 2013)) %>%
ggplot(aes(x = month_cat, fill = month_cat)) +
geom_bar(position=position_dodge(preserve = "single"), stat='count') +
geom_text(stat='count', aes(label=..count..), vjust=-1) +
scale_fill_manual(values=c("#882E72", "#5289C7", "#CAE0AB",
"#E8601C", "#DC050C", "#D6C1DE", "#4477AA")) +
ylim(0, 2300) +
labs(x = "Time to Solve", y = "Cases Count",
title = "Time to Solve Rat Sighting", fill = "Month") +
facet_grid(.~Sighting_Year) +
theme(axis.text.x = element_text(angle = 90,
hjust = 1, vjust = 0),
plot.title = element_text(hjust=0.5))
# plot the duration taken to solved the issue of each year
solved_duration %>%
filter(Sighting_Year == c(2014, 2015, 2016, 2017)) %>%
ggplot(aes(x = month_cat, fill = month_cat)) +
geom_bar(position=position_dodge(preserve = "single"), stat='count') +
geom_text(stat='count', aes(label=..count..), vjust= -1) +
scale_fill_manual(values = c("#882E72", "#5289C7", "#CAE0AB",
"#E8601C", "#DC050C", "#D6C1DE", "#4477AA"),
labels = c("less than 1 month", "less than 2 month",
"less than 3 month", "less than 4 month",
"less than 5 month", "less than half year",
"more than half year")) +
ylim(0, 3000) +
labs(x = "Time to Solve", y = "Cases Count",
title = "Time to Solve Rat Sighting", fill = "Month") +
facet_grid(.~Sighting_Year) +
theme(axis.text.x = element_text(angle = 90,
hjust = 1, vjust = 0),
plot.title = element_text(hjust=0.5))
On balance, from 2010 to 2017, an increasing number of sighting cases were solved within 1 month, with the number of cases requiring more than 2 months to be closed decreasing each year.
Plot the percentage of rat sighting case resolution times by year
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
solved_duration %>%
filter(Sighting_Year == c(2010, 2011, 2012), duration > 0) %>%
ggplot(aes(x= month_cat, group=Sighting_Year)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
geom_text(aes(label = scales::percent(round((..prop..),3)),
y= ..prop.. ),
stat= "count", vjust = -0.5, hjust = 0.5, angle = 0,
posistion = position_dodge(width=0.9), size = 3) +
scale_fill_manual(values = c("#882E72", "#5289C7", "#CAE0AB",
"#E8601C", "#DC050C", "#D6C1DE", "#4477AA"),
labels = c("less than 1 month", "less than 2 month",
"less than 3 month", "less than 4 month",
"less than 5 month", "less than half year",
"more than half year")) +
ylim(0, 100) +
labs(x = "Time to Solve", y = "Cases Count",
title = "Time to Solve Rat Sighting", fill = "Month") +
facet_grid(~Sighting_Year) +
scale_y_continuous(labels=percent) +
theme(axis.text.x = element_text(angle = 90,
hjust = 1, vjust = 0),
plot.title = element_text(hjust=0.5))
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
solved_duration %>%
filter(Sighting_Year == c(2013, 2014, 2015)) %>%
ggplot(aes(x= month_cat, group=Sighting_Year)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
geom_text(aes(label = scales::percent(round((..prop..),3)),
y= ..prop.. ),
stat= "count", vjust = -0.5, hjust = 0.5, angle = 0,
posistion = position_dodge(width=0.9), size = 3) +
scale_fill_manual(values = c("#882E72", "#5289C7", "#CAE0AB",
"#E8601C", "#DC050C", "#D6C1DE", "#4477AA"),
labels = c("less than 1 month", "less than 2 month",
"less than 3 month", "less than 4 month",
"less than 5 month", "less than half year",
"more than half year")) +
ylim(0, 100) +
labs(x = "Time to Solve", y = "Cases Count",
title = "Time to Solve Rat Sighting", fill = "Month") +
facet_grid(~Sighting_Year) +
scale_y_continuous(labels=percent) +
theme(axis.text.x = element_text(angle = 90,
hjust = 1, vjust = 0),
plot.title = element_text(hjust=0.5))
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
solved_duration %>%
filter(Sighting_Year == c(2016, 2017)) %>%
ggplot(aes(x= month_cat, group=Sighting_Year)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
geom_text(aes(label = scales::percent(round((..prop..),3)),
y= ..prop.. ),
stat= "count", vjust = -0.5, hjust = 0.5, angle = 0,
posistion = position_dodge(width=0.9), size = 4) +
scale_fill_manual(values = c("#882E72", "#5289C7", "#CAE0AB",
"#E8601C", "#DC050C", "#D6C1DE", "#4477AA"),
labels = c("less than 1 month", "less than 2 month",
"less than 3 month", "less than 4 month",
"less than 5 month", "less than half year",
"more than half year")) +
ylim(0, 100) +
labs(x = "Time to Solve", y = "Cases Count",
title = "Time to Solve Rat Sighting", fill = "Month") +
facet_grid(~Sighting_Year) +
scale_y_continuous(labels=percent) +
theme(axis.text.x = element_text(angle = 90,
hjust = 1, vjust = 0),
plot.title = element_text(hjust=0.5))
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
As seen in the graph, the overall efficiency of solving rat sighting cases has improved year by year, from 73.2% of cases resolved within 1 month in 2010 to approximately 80% of cases closed within 1 month in 2011, 2012, and further to 90% of cases settled within 1 month in 2013 to 2017.
Quick side note: 2008 and 2009 have been ignored because they only have 18 and 1 rat sightings respectively. Also, when looking up the data; some borough have different names as counties, but exactly the same area, so one should look up the county names instead.
Borough_Rats <-
rat_dataset %>%
group_by(Borough, Due_Year) %>%
tally(sort = TRUE) %>%
pivot_wider(names_from = Due_Year, values_from = n) %>%
select(Borough, "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017") %>%
drop_na()
head(Borough_Rats)
## # A tibble: 5 × 9
## # Groups: Borough [5]
## Borough `2010` `2011` `2012` `2013` `2014` `2015` `2016` `2017`
## <fct> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 BROOKLYN 3237 3631 3574 3359 3887 5136 5999 5795
## 2 MANHATTAN 2734 2410 2812 3053 3467 4063 4648 3563
## 3 BRONX 1938 2145 2145 2110 2721 3158 3455 3026
## 4 QUEENS 1511 1582 1512 1687 1713 2065 2409 2316
## 5 STATEN ISLAND 619 550 530 649 603 590 732 643
Borough_Population <-
read_csv("New_York_City_Population_by_Borough__1950_-_2040.csv", show_col_types = FALSE) %>%
select("Borough", "Area", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017") %>%
mutate(Borough = toupper(Borough)) %>%
inner_join(Borough_Rats, by = "Borough") %>%
rename(Borough = "Borough", Area = "Area", Population_2010 = "2010.x", Rat_Sightings_2010 = "2010.y", Population_2011 = "2011.x", Rat_Sightings_2011 = "2011.y", Population_2012 = "2012.x", Rat_Sightings_2012 = "2012.y", Population_2013 = "2013.x", Rat_Sightings_2013 = "2013.y", Population_2014 = "2014.x", Rat_Sightings_2014 = "2014.y", Population_2015 = "2015.x", Rat_Sightings_2015 = "2015.y", Population_2016 = "2016.x", Rat_Sightings_2016 = "2016.y", Population_2017 = "2017.x", Rat_Sightings_2017 = "2017.y") %>%
relocate(Borough, Area, Population_2010, Rat_Sightings_2010, Population_2011, Rat_Sightings_2011, Population_2012, Rat_Sightings_2012, Population_2013, Rat_Sightings_2013, Population_2014, Rat_Sightings_2014, Population_2015, Rat_Sightings_2015, Population_2016, Rat_Sightings_2016, Population_2017, Rat_Sightings_2017) #%>%
## Warning: One or more parsing issues, see `problems()` for details
#mutate(Sightings_Rate = round(((Rat_Sightings / Population) * 100000), digits = 2))
head(Borough_Population)
## # A tibble: 5 × 18
## Borough Area Popul…¹ Rat_S…² Popul…³ Rat_S…⁴ Popul…⁵ Rat_S…⁶ Popul…⁷ Rat_S…⁸
## <chr> <dbl> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int>
## 1 BRONX 110 1385108 1938 1396954 2145 1411087 2145 1421498 2110
## 2 BROOKLYN 183. 2552911 3237 2540918 3631 2568538 3574 2587759 3359
## 3 MANHATT… 59.1 1585873 2734 1608717 2410 1624573 2812 1628379 3053
## 4 QUEENS 280 2250002 1511 2255261 1582 2271920 1512 2286788 1687
## 5 STATEN … 152 468730 619 471014 550 470597 530 471783 649
## # … with 8 more variables: Population_2014 <dbl>, Rat_Sightings_2014 <int>,
## # Population_2015 <dbl>, Rat_Sightings_2015 <int>, Population_2016 <dbl>,
## # Rat_Sightings_2016 <int>, Population_2017 <dbl>, Rat_Sightings_2017 <int>,
## # and abbreviated variable names ¹Population_2010, ²Rat_Sightings_2010,
## # ³Population_2011, ⁴Rat_Sightings_2011, ⁵Population_2012,
## # ⁶Rat_Sightings_2012, ⁷Population_2013, ⁸Rat_Sightings_2013
##Creating two new databases; one for rats per population and one for rats per area
Rats_Per_Pop <-
Borough_Population %>%
mutate('2010' = round(((Rat_Sightings_2010 / Population_2010) * 100000), digits = 2), '2011' = round(((Rat_Sightings_2011 / Population_2011) * 100000), digits = 2), '2012' = round(((Rat_Sightings_2012 / Population_2012) * 100000), digits = 2), '2013' = round(((Rat_Sightings_2013 / Population_2013) * 100000), digits = 2), '2014' = round(((Rat_Sightings_2014 / Population_2014) * 100000), digits = 2), '2015' = round(((Rat_Sightings_2015 / Population_2015) * 100000), digits = 2), '2016' = round(((Rat_Sightings_2016 / Population_2016) * 100000), digits = 2), '2017' = round(((Rat_Sightings_2017 / Population_2017) * 100000), digits = 2)) %>%
select(Borough, '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017')
head(Rats_Per_Pop)
## # A tibble: 5 × 9
## Borough `2010` `2011` `2012` `2013` `2014` `2015` `2016` `2017`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 BRONX 140. 154. 152. 148. 190. 219. 239. 210.
## 2 BROOKLYN 127. 143. 139. 130. 149. 197. 230. 223.
## 3 MANHATTAN 172. 150. 173. 187. 212. 248. 284. 0
## 4 QUEENS 67.2 70.2 66.6 73.8 74.5 89.6 104. 101.
## 5 STATEN ISLAND 132. 117. 113. 138. 128. 125. 154. 135.
Rats_Per_Km <-
Borough_Population %>%
mutate('2010' = round((Rat_Sightings_2010 / Area), digits = 2), '2011' = round((Rat_Sightings_2011 / Area), digits = 2), '2012' = round((Rat_Sightings_2012 / Area), digits = 2), '2013' = round((Rat_Sightings_2013 / Area), digits = 2), '2014' = round((Rat_Sightings_2014 / Area), digits = 2), '2015' = round((Rat_Sightings_2015 / Area), digits = 2), '2016' = round((Rat_Sightings_2016 / Area), digits = 2), '2017' = round((Rat_Sightings_2017 / Area), digits = 2)) %>%
select(Borough, '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017')
head(Rats_Per_Km)
## # A tibble: 5 × 9
## Borough `2010` `2011` `2012` `2013` `2014` `2015` `2016` `2017`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 BRONX 17.6 19.5 19.5 19.2 24.7 28.7 31.4 27.5
## 2 BROOKLYN 17.6 19.8 19.5 18.3 21.2 28 32.7 31.6
## 3 MANHATTAN 46.3 40.8 47.6 51.7 58.7 68.8 78.6 60.3
## 4 QUEENS 5.4 5.65 5.4 6.03 6.12 7.38 8.6 8.27
## 5 STATEN ISLAND 4.07 3.62 3.49 4.27 3.97 3.88 4.82 4.23
##Visualization for above
P_Rates <-
Rats_Per_Pop %>%
pivot_longer(c('2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017'), names_to = "Year", values_to = "Rate")
as.factor(P_Rates$Year)
## [1] 2010 2011 2012 2013 2014 2015 2016 2017 2010 2011 2012 2013 2014 2015 2016
## [16] 2017 2010 2011 2012 2013 2014 2015 2016 2017 2010 2011 2012 2013 2014 2015
## [31] 2016 2017 2010 2011 2012 2013 2014 2015 2016 2017
## Levels: 2010 2011 2012 2013 2014 2015 2016 2017
head(P_Rates, n = 20)
## # A tibble: 20 × 3
## Borough Year Rate
## <chr> <chr> <dbl>
## 1 BRONX 2010 140.
## 2 BRONX 2011 154.
## 3 BRONX 2012 152.
## 4 BRONX 2013 148.
## 5 BRONX 2014 190.
## 6 BRONX 2015 219.
## 7 BRONX 2016 239.
## 8 BRONX 2017 210.
## 9 BROOKLYN 2010 127.
## 10 BROOKLYN 2011 143.
## 11 BROOKLYN 2012 139.
## 12 BROOKLYN 2013 130.
## 13 BROOKLYN 2014 149.
## 14 BROOKLYN 2015 197.
## 15 BROOKLYN 2016 230.
## 16 BROOKLYN 2017 223.
## 17 MANHATTAN 2010 172.
## 18 MANHATTAN 2011 150.
## 19 MANHATTAN 2012 173.
## 20 MANHATTAN 2013 187.
P_Rates %>%
ggplot(aes(x=Year, y=Rate, color = Borough, group = Borough)) +
geom_point() +
geom_line() +
labs(x = "Year", y = "Sightings per 100.000 population", title = "Rate of sightings by Borough population", color = "Borough") +
theme_classic() +
theme(plot.title = element_text(hjust=0.5))
P_Rates %>%
ggplot(aes(x=Year, y=Rate, group = Borough)) +
geom_point() +
geom_line() +
facet_wrap(~ Borough) +
labs(x = "Year", y = "Sightings per 100.000 population", title = "Rate of sightings by Borough population") +
theme_classic() +
theme(plot.title = element_text(hjust=0.5))
A_Rates <-
Rats_Per_Km %>%
pivot_longer(c('2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017'), names_to = "Year", values_to = "Rate")
as.factor(A_Rates$Year)
## [1] 2010 2011 2012 2013 2014 2015 2016 2017 2010 2011 2012 2013 2014 2015 2016
## [16] 2017 2010 2011 2012 2013 2014 2015 2016 2017 2010 2011 2012 2013 2014 2015
## [31] 2016 2017 2010 2011 2012 2013 2014 2015 2016 2017
## Levels: 2010 2011 2012 2013 2014 2015 2016 2017
head(A_Rates, n = 20)
## # A tibble: 20 × 3
## Borough Year Rate
## <chr> <chr> <dbl>
## 1 BRONX 2010 17.6
## 2 BRONX 2011 19.5
## 3 BRONX 2012 19.5
## 4 BRONX 2013 19.2
## 5 BRONX 2014 24.7
## 6 BRONX 2015 28.7
## 7 BRONX 2016 31.4
## 8 BRONX 2017 27.5
## 9 BROOKLYN 2010 17.6
## 10 BROOKLYN 2011 19.8
## 11 BROOKLYN 2012 19.5
## 12 BROOKLYN 2013 18.3
## 13 BROOKLYN 2014 21.2
## 14 BROOKLYN 2015 28
## 15 BROOKLYN 2016 32.7
## 16 BROOKLYN 2017 31.6
## 17 MANHATTAN 2010 46.3
## 18 MANHATTAN 2011 40.8
## 19 MANHATTAN 2012 47.6
## 20 MANHATTAN 2013 51.7
A_Rates %>%
ggplot(aes(x=Year, y=Rate, color = Borough, group = Borough)) +
geom_point() +
geom_line() +
labs(x = "Year", y = "Sightings per square kilometre", title = "Rate of sightings by Borough area", color = "Borough") +
theme_classic() +
theme(plot.title = element_text(hjust=0.5))
A_Rates %>%
ggplot(aes(x=Year, y=Rate, group = Borough)) +
geom_point() +
geom_line() +
facet_wrap(~ Borough) +
labs(x = "Year", y = "Sightings per square kilometre", title = "Rate of sightings by Borough area", color = "Borough") +
theme_classic() +
theme(plot.title = element_text(hjust=0.5))